import pandas as pd,os
# Work out of the capstone project folder (Windows path) so the Excel file
# below resolves relative to it.
path="E:\\Projects\\Jigsaw\\Capstone"
os.chdir(path)
os.getcwd()
# Load the raw student application / performance workbook.
df=pd.read_excel('Student Applications & Performance.xlsx')
df.shape
df.isnull().sum()
# Percentage of missing values per column, highest first (notebook display only;
# this df_remove is overwritten two lines down).
df_remove=pd.DataFrame(round((df.isnull().sum()/df.shape[0])*100,2).sort_values(ascending=False)).reset_index()
df_remove
# Names of the 16 columns with the most missing data.
df_remove=pd.DataFrame(round((df.isnull().sum()/df.shape[0]*100)).nlargest(16)).reset_index()['index'].sort_values(ascending=False)
# Drop those 16 columns in place.
# NOTE(review): drop(..., inplace=True) returns None, so kp is always None.
kp=df.drop(columns=df_remove.tolist(),axis=1,inplace=True)
df.shape
df.columns
df.isnull().sum()
# Group column names by dtype to see which are numeric vs object.
df.columns.to_series().groupby(df.dtypes).groups
# Drop identifier and descriptive columns that duplicate the coded fields or
# are not needed for modelling.
df.drop(columns=['STUDENT IDENTIFIER','FATHER_HI_EDU_DESC','MOTHER_HI_EDU_DESC','DEGREE_GROUP_DESC','FIRST_TERM','SECOND_TERM'],axis=1,inplace=True)
df.shape
df.head()
# Impute all missing values in a single fillna pass:
#   - parent-education codes and school name -> most frequent value
#   - term hours / entrance score / distance / GPA -> column mean
#   - core-course name/grade columns -> fixed typical values
fill_values = {}
for cat_col in ('FATHER_HI_EDU_CD', 'MOTHER_HI_EDU_CD', 'HIGH_SCHL_NAME'):
    fill_values[cat_col] = df[cat_col].value_counts().keys()[0]
for num_col in ('SECOND_TERM_ATTEMPT_HRS', 'SECOND_TERM_EARNED_HRS',
                'STDNT_TEST_ENTRANCE_COMB', 'DISTANCE_FROM_HOME',
                'HIGH_SCHL_GPA'):
    fill_values[num_col] = df[num_col].mean()
fill_values.update({
    'CORE_COURSE_NAME_2_F': "ENGL 1101",
    'CORE_COURSE_GRADE_2_F': "B",
    'CORE_COURSE_NAME_3_F': "ENGL 1101",
    'CORE_COURSE_GRADE_3_F': "B",
    'CORE_COURSE_NAME_1_S': "ENGL 1102",
    'CORE_COURSE_GRADE_1_S': "C",
    'CORE_COURSE_NAME_2_S': "ENGL 1102",
    'CORE_COURSE_GRADE_2_S': "B",
})
df.fillna(fill_values, inplace=True)
# Notebook display of the father-education code frequencies.
df['FATHER_HI_EDU_CD'].value_counts().keys()
# Grades recorded as INCOMPL are treated the same as NOT REP.
for grade_col in ('CORE_COURSE_GRADE_1_F', 'CORE_COURSE_GRADE_2_F',
                  'CORE_COURSE_GRADE_1_S'):
    df[grade_col].replace('INCOMPL', 'NOT REP', inplace=True)
df['CORE_COURSE_GRADE_1_S'].unique()
df['CORE_COURSE_GRADE_1_F'].unique()
# Strip the literal "SCHOOL" prefix and keep the numeric remainder of the name.
df['HIGH_SCHL_NAME'] = df['HIGH_SCHL_NAME'].str.replace("SCHOOL", "").astype('int64')
# Same treatment for the "BGD"-prefixed student-background codes.
df['STDNT_BACKGROUND'] = df['STDNT_BACKGROUND'].str.replace("BGD", "").astype('int64')
# Force the imputed father-education code back to an integer dtype.
df['FATHER_HI_EDU_CD'] = df['FATHER_HI_EDU_CD'].astype('int64')
# Encode UNMET_NEED: positive amounts -> 1, negative amounts -> 2, zero stays 0.
# BUG FIX: the original used chained indexing (df['UNMET_NEED'][mask] = ...),
# which raises SettingWithCopyWarning and silently modifies a temporary copy
# under pandas copy-on-write; .loc assigns on the frame itself.
df.loc[df['UNMET_NEED'] > 0, 'UNMET_NEED'] = 1
df.loc[df['UNMET_NEED'] < 0, 'UNMET_NEED'] = 2
# (values equal to 0 are already 0, so no third assignment is needed)
df.isnull().sum()
# In the raw data 0 means attriting and 1 means not attriting.
df.RETURNED_2ND_YR.value_counts()
# Flip the labels so 1 marks attriting students (the positive class for modelling).
df["RETURNED_2ND_YR"].replace([0,1],[1,0], inplace=True)
df.RETURNED_2ND_YR.value_counts()
df.describe()
# Multiple violin plot: age split by attrition status and gender.
import plotly.express as px
fig = px.violin(df, y='STDNT_AGE', x='RETURNED_2ND_YR', color='STDNT_GENDER', box=True, points='all', hover_data=df.columns)
fig.show()
df2=df.copy()
# New frame restricted to the students who left.
df_new=df.copy()
# BUG FIX: RETURNED_2ND_YR holds integers after the label flip, so the original
# comparison against the string '1' matched nothing; compare against the int 1.
df_new=df_new.query("RETURNED_2ND_YR==1")
# df_new now contains only the students who left.
px.parallel_coordinates(df_new,dimensions=["EST_FAM_CONTRIBUTION","GROSS_FIN_NEED","UNMET_NEED","COST_OF_ATTEND","RETURNED_2ND_YR"],color="RETURNED_2ND_YR",color_continuous_scale=["red","green","blue"])
px.parallel_categories(df_new,dimensions=["SECOND_TERM_EARNED_HRS","FIRST_TERM_EARNED_HRS","FIRST_TERM_ATTEMPT_HRS","SECOND_TERM_ATTEMPT_HRS","RETURNED_2ND_YR"],color="RETURNED_2ND_YR",color_continuous_scale=px.colors.sequential.Inferno)
px.parallel_categories(df_new,dimensions=["FATHER_HI_EDU_CD","MOTHER_HI_EDU_CD","DEGREE_GROUP_CD","RETURNED_2ND_YR"],color="RETURNED_2ND_YR",color_continuous_scale=px.colors.sequential.Inferno)
px.parallel_coordinates(df_new,dimensions=["HIGH_SCHL_GPA","HIGH_SCHL_NAME","RETURNED_2ND_YR"],color="RETURNED_2ND_YR",color_continuous_scale=["red","green","blue"])
px.parallel_categories(df,dimensions=["IN_STATE_FLAG","INTERNATIONAL_STS","RETURNED_2ND_YR"],color="RETURNED_2ND_YR",color_continuous_scale=px.colors.sequential.Inferno)
px.parallel_categories(df_new,dimensions=["STDNT_AGE","STDNT_GENDER","STDNT_BACKGROUND","RETURNED_2ND_YR"],color="RETURNED_2ND_YR",color_continuous_scale=px.colors.sequential.Inferno)
px.parallel_categories(df_new,dimensions=["STDNT_MAJOR","STDNT_MINOR","STDNT_TEST_ENTRANCE_COMB","RETURNED_2ND_YR"],color="RETURNED_2ND_YR",color_continuous_scale=px.colors.sequential.Inferno)
px.parallel_categories(df_new,dimensions=["CORE_COURSE_NAME_1_F","CORE_COURSE_NAME_2_F","RETURNED_2ND_YR"],color="RETURNED_2ND_YR",color_continuous_scale=px.colors.sequential.Inferno)
px.parallel_categories(df_new,dimensions=["CORE_COURSE_GRADE_1_F","CORE_COURSE_GRADE_2_F","CORE_COURSE_GRADE_3_F","RETURNED_2ND_YR"],color="RETURNED_2ND_YR",color_continuous_scale=px.colors.sequential.Inferno)
px.parallel_categories(df_new,dimensions=["CORE_COURSE_NAME_3_F","RETURNED_2ND_YR"],color="RETURNED_2ND_YR",color_continuous_scale=px.colors.sequential.Inferno)
px.parallel_categories(df_new,dimensions=["CORE_COURSE_NAME_1_S","CORE_COURSE_NAME_2_S","RETURNED_2ND_YR"],color="RETURNED_2ND_YR",color_continuous_scale=px.colors.sequential.Inferno)
px.parallel_categories(df_new,dimensions=["CORE_COURSE_GRADE_1_S","CORE_COURSE_GRADE_2_S","RETURNED_2ND_YR"],color="RETURNED_2ND_YR",color_continuous_scale=px.colors.sequential.Inferno)
px.parallel_categories(df_new,dimensions=["DISTANCE_FROM_HOME","HOUSING_STS","RETURNED_2ND_YR"],color="RETURNED_2ND_YR",color_continuous_scale=px.colors.sequential.Inferno)
# Outlier detection: one distribution plot per numeric column.
# The original repeated the same four lines fourteen times and loaded the
# unused px.data.tips() sample dataset before every plot; a loop over the
# column lists removes the duplication and the wasted loads.
import plotly.express as px
violin_cols = ['STDNT_AGE', 'STDNT_BACKGROUND', 'STDNT_TEST_ENTRANCE_COMB']
box_cols = ['DISTANCE_FROM_HOME', 'HIGH_SCHL_GPA', 'HIGH_SCHL_NAME',
            'FATHER_HI_EDU_CD', 'MOTHER_HI_EDU_CD', 'SECOND_TERM_EARNED_HRS',
            'FIRST_TERM_EARNED_HRS', 'GROSS_FIN_NEED', 'COST_OF_ATTEND',
            'EST_FAM_CONTRIBUTION', 'UNMET_NEED']
# Violin (with inner box) for the first group, plain box plots for the rest,
# in the same order the original produced them.
for col in violin_cols:
    px.violin(df, y=col, box=True).show()
for col in box_cols:
    px.box(df, y=col).show()
# Cardinality of every column (distinct values, NaN included by unique()).
for col in df.columns:
    print(col, ':', len(df[col].unique()), 'labels')
# Collect the object-dtype (categorical) column names.
# BUG FIX: DataFrame.iteritems() was removed in pandas 2.0; items() is the
# long-standing equivalent.
categorical = []
for col, values in df.items():
    if values.dtype == 'object':
        categorical.append(col)
categorical
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # data visualization library
import matplotlib.pyplot as plt
%matplotlib inline
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import time
from subprocess import check_output
# Correlation heatmap of the numeric columns.
# numeric_only=True keeps DataFrame.corr() working while df still contains
# object-dtype columns (pandas >= 2.0 raises instead of silently dropping them).
f,ax = plt.subplots(figsize=(14, 14))
sns.heatmap(df.corr(numeric_only=True), annot=True, fmt= '.1f',ax=ax)
# Flag features whose pairwise |correlation| exceeds 0.6:
#   "COST_OF_ATTEND" is highly correlated with "UNMET_NEED"/"GROSS_FIN_NEED"
#   "SECOND_TERM_ATTEMPT_HRS" with "SECOND_TERM_EARNED_HRS"
#   "FIRST_TERM_ATTEMPT_HRS" with "FIRST_TERM_EARNED_HRS"
df2=df.copy()
# Walk the lower triangle of the correlation matrix and record the later of any
# pair above the 0.6 threshold. The target is dropped first so it is never
# flagged. numeric_only=True is required on pandas >= 2.0 because df2 still
# holds object-dtype columns.
correlated_features = set()
correlation_matrix = df2.drop('RETURNED_2ND_YR', axis=1).corr(numeric_only=True)
for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > 0.6:
            colname = correlation_matrix.columns[i]
            correlated_features.add(colname)
correlated_features
# 'HIGH_SCHL_NAME' is not required; the other four are dropped because of high
# correlation with columns we keep.
df=df.drop(['HIGH_SCHL_NAME','SECOND_TERM_ATTEMPT_HRS','FIRST_TERM_ATTEMPT_HRS','GROSS_FIN_NEED','UNMET_NEED'],axis=1)
df3=df.copy()
x=df3.drop(["RETURNED_2ND_YR"],axis=1)
y=df3['RETURNED_2ND_YR']
# One-hot encode the remaining categorical columns for the selectors below.
x=pd.get_dummies(x)
print(x.shape)
print(y.shape)
x.columns
def cor_selector(x, y, num_feats=100):
    """Rank features by absolute Pearson correlation with the target.

    Generalized: the original hard-coded the top-100 cut-off; it is now the
    ``num_feats`` parameter (default 100, so existing calls are unchanged).

    Parameters
    ----------
    x : pandas.DataFrame
        Feature matrix with numeric columns.
    y : array-like
        Target vector, same length as ``x``.
    num_feats : int, optional
        Number of top-|correlation| features to keep (default 100).

    Returns
    -------
    cor_support : list of bool
        Per-column selection mask, in ``x.columns`` order.
    cor_feature : list of str
        Names of the selected columns, weakest-first (argsort order).
    """
    # correlation with y for each feature
    cor_list = []
    for col in x.columns.tolist():
        cor_list.append(np.corrcoef(x[col], y)[0, 1])
    # column names of the num_feats largest |correlations|
    cor_feature = x.iloc[:, np.argsort(np.abs(cor_list))[-num_feats:]].columns.tolist()
    # boolean mask aligned with x.columns (True = selected)
    cor_support = [col in cor_feature for col in x.columns.tolist()]
    return cor_support, cor_feature
# Filter method 1: Pearson-correlation ranking.
cor_support, cor_feature = cor_selector(x, y)
print(f'{len(cor_feature)} selected features')
cor_feature
# Filter method 2: chi-squared test on min-max scaled features.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
x_norm = MinMaxScaler().fit_transform(x)
chi_selector = SelectKBest(chi2, k=100)
chi_selector.fit(x_norm, y)
chi_support = chi_selector.get_support()
chi_feature = x.loc[:, chi_support].columns.tolist()
print(f'{len(chi_feature)} selected features')
# Wrapper method: recursive feature elimination with logistic regression.
import warnings
warnings.filterwarnings("ignore")
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=100, step=1, verbose=5)
rfe_selector.fit(x_norm, y)
rfe_support = rfe_selector.get_support()
rfe_feature = x.loc[:, rfe_support].columns.tolist()
print(f'{len(rfe_feature)} selected features')
rfe_feature
# Embedded method 1: L2 logistic-regression coefficients.
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
# BUG FIX: threshold is a keyword-only argument of SelectFromModel; passing
# '1.25*median' positionally raises TypeError on current scikit-learn.
embeded_lr_selector = SelectFromModel(LogisticRegression(penalty="l2"), threshold='1.25*median')
embeded_lr_selector.fit(x_norm,y)
embeded_lr_support = embeded_lr_selector.get_support()
embeded_lr_feature = x.loc[:,embeded_lr_support].columns.tolist()
print(str(len(embeded_lr_feature)), 'selected features')
# Embedded method 2: random-forest feature importances.
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100), threshold='1.25*median')
embeded_rf_selector.fit(x, y)
embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = x.loc[:,embeded_rf_support].columns.tolist()
print(str(len(embeded_rf_feature)), 'selected features')
embeded_rf_feature
pd.set_option('display.max_rows', None)
# Combine the five selection masks into one summary table.
feature_selection_df = pd.DataFrame({'Feature':x.columns.tolist(), 'Pearson':cor_support, 'Chi-2':chi_support, 'RFE':rfe_support, 'Logistics':embeded_lr_support,
                                     'Random Forest':embeded_rf_support,})
# Count how many selectors picked each feature.
# BUG FIX: np.sum over the whole frame also tried to "sum" the string Feature
# column, which fails on recent pandas/numpy; sum only the boolean columns.
selector_cols = ['Pearson', 'Chi-2', 'RFE', 'Logistics', 'Random Forest']
feature_selection_df['Total'] = feature_selection_df[selector_cols].sum(axis=1)
# display the top 100, most-selected first
feature_selection_df = feature_selection_df.sort_values(['Total','Feature'] , ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df)+1)
feature_selection_df.head(100)
# An exhaustive RFE sweep (1..493 features with a GradientBoostingClassifier,
# refitting and scoring a random forest at each size) could also drive feature
# selection, but it is far too slow to run here.
count_no_left = int((df3['RETURNED_2ND_YR'] == 1).sum())
count_join = int((df3['RETURNED_2ND_YR'] == 0).sum())
total = count_no_left + count_join
pct_of_no_left = count_no_left / total
print("percentage of no of left is", pct_of_no_left*100)
pct_of_join = count_join / total
print("percentage of join", pct_of_join*100)
# The classes are imbalanced: the join-to-left ratio is roughly 79:21.
df4=df.copy()
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Label-encode object columns with at most two distinct values (unique() counts
# NaN as a value); the rest are one-hot encoded by get_dummies below.
le = LabelEncoder()
le_count = 0
# NOTE(review): the first column is deliberately skipped here — confirm intent.
for col in df4.columns[1:]:
    if df4[col].dtype != 'object':
        continue
    if len(df4[col].unique()) > 2:
        continue
    df4[col] = le.fit_transform(df4[col])
    le_count += 1
print(f'{le_count} columns were label encoded.')
df4.head()
df4.columns
df3.head()
X=df4.drop(["RETURNED_2ND_YR"],axis=1)
Y=df4['RETURNED_2ND_YR']
X=pd.get_dummies(X)
import sklearn.model_selection as model_selection
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
# Split-out validation dataset: 80/20 split, fixed seed for reproducibility.
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=0.20, random_state=0)
# Spot Check Algorithms
# NOTE(review): RandomForestClassifier is not imported in this section; it
# relies on the earlier sklearn.ensemble import in this file.
models = []
models.append(('LR', LogisticRegression(solver='liblinear', multi_class='ovr')))
models.append(('RF', RandomForestClassifier()))
models.append(('xgboost', XGBClassifier(random_state=7)))
models.append(('DT',DecisionTreeClassifier()))
# evaluate each model in turn: 10-fold stratified CV accuracy
results = []
names = []
for name, model in models:
    kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))
# Same CV protocol scored by ROC AUC; `names` is rebuilt in the same order.
auc_results = []
names = []
for name, model in models:
    kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
    cv_auc_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='roc_auc')
    auc_results.append(cv_auc_results)
    # auc_results.append(cv_auc_results)
    names.append(name)
    print('%s: %f (%f)' % (name, cv_auc_results.mean(), cv_auc_results.std()))
import matplotlib.pyplot as plt
# Compare Algorithms: box plot of the per-fold accuracy distributions.
fig = plt.figure(figsize=(15, 7))
plt.boxplot(results, labels=names)
plt.title('Algorithm Comparison')
plt.show()
print("Number transactions X_train dataset: ", X_train.shape)
print("Number transactions Y_train dataset: ", Y_train.shape)
print("Number transactions X_test dataset: ", X_test.shape)
print("Number transactions Y_test dataset: ", Y_test.shape)
# Fit a plain logistic regression on the hold-out split and report accuracy.
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
# Accuracy on the test set (notebook display).
logreg.score(X_test,Y_test)
Y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, Y_test)))
from sklearn.metrics import confusion_matrix
# BUG FIX: the original rebound the name `confusion_matrix` to the result
# array, shadowing the imported function (later sections had to re-import it).
cm = confusion_matrix(Y_test, Y_pred)
print(cm)
from sklearn.metrics import classification_report
print(classification_report(Y_test, Y_pred))
# Metrics primer (for the classification_report above):
# precision = tp / (tp + fp), where tp is true positives and fp false positives
#   — intuitively, the classifier's ability not to label a negative sample as
#   positive.
# recall = tp / (tp + fn), where fn is false negatives — the ability to find
#   all the positive samples.
# The F-beta score is a weighted harmonic mean of precision and recall (best
# value 1, worst 0); it weights recall more than precision by a factor of beta,
# and beta = 1.0 means recall and precision are equally important.
# Support is the number of occurrences of each class in y_test.
# Precision-recall curve and F1 score for the logistic model.
from sklearn.datasets import make_classification
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from matplotlib import pyplot
# Positive-class probability for every test row.
lr_probs = logreg.predict_proba(X_test)
lr_probs = lr_probs[:, 1]
# Hard class predictions for the F1 score.
yhat = logreg.predict(X_test)
lr_precision, lr_recall, _ = precision_recall_curve(Y_test, lr_probs)
lr_f1 = f1_score(Y_test, yhat)
lr_auc = auc(lr_recall, lr_precision)
print(f'Logistic: f1={lr_f1:.3f} auc={lr_auc:.3f}')
# A "no skill" model sits at the positive-class rate on a PR plot.
no_skill = len(Y_test[Y_test==1]) / len(Y_test)
pyplot.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
pyplot.plot(lr_recall, lr_precision, marker='.', label='Logistic')
pyplot.xlabel('Recall')
pyplot.ylabel('Precision')
pyplot.legend()
pyplot.show()
# ROC curve and area under it for the same model.
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
logit_roc_auc = roc_auc_score(Y_test, logreg.predict(X_test))
fpr, tpr, thresholds = roc_curve(Y_test, logreg.predict_proba(X_test)[:,1])
plt.figure(figsize=(14, 6))
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
# Random forest with out-of-bag scoring for an extra generalization estimate.
from sklearn.ensemble import RandomForestClassifier
clf=RandomForestClassifier(n_estimators=210,oob_score=True,n_jobs=-1,random_state=400)
clf.fit(X_train,Y_train)
print(clf.oob_score_)
print(clf.score(X_train,Y_train))
clf.score(X_test,Y_test)
Y_pred = clf.predict(X_test)
from sklearn.metrics import confusion_matrix
# BUG FIX: don't rebind the name `confusion_matrix` — it shadows the imported
# function for the rest of the module.
cm = confusion_matrix(Y_test, Y_pred)
print(cm)
from sklearn.metrics import classification_report
print(classification_report(Y_test, Y_pred))
# Precision-recall curve and F1 score for the random forest.
from sklearn.datasets import make_classification
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from matplotlib import pyplot
# Positive-class probability for every test row.
rf_probs = clf.predict_proba(X_test)
rf_probs = rf_probs[:, 1]
# Hard class predictions for the F1 score.
yhat = clf.predict(X_test)
rf_precision, rf_recall, _ = precision_recall_curve(Y_test, rf_probs)
rf_f1, rf_auc = f1_score(Y_test, yhat), auc(rf_recall, rf_precision)
print('RandomForest: f1=%.3f auc=%.3f' % (rf_f1, rf_auc))
# A "no skill" model sits at the positive-class rate on a PR plot.
no_skill = len(Y_test[Y_test==1]) / len(Y_test)
pyplot.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
# BUG FIX: the original plotted the logistic curve (lr_recall/lr_precision)
# under the RandomForest label; plot the random-forest curve instead.
pyplot.plot(rf_recall, rf_precision, marker='.', label='RandomForest')
pyplot.xlabel('Recall')
pyplot.ylabel('Precision')
pyplot.legend()
pyplot.show()
# ROC curve and area under it for the random forest.
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
clf_roc_auc = roc_auc_score(Y_test, clf.predict(X_test))
fpr, tpr, thresholds = roc_curve(Y_test, clf.predict_proba(X_test)[:,1])
plt.figure(figsize=(14, 6))
plt.plot(fpr, tpr, label='Random Forest (area = %0.2f)' % clf_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('RF_ROC')
plt.show()
# Tune the gradient-boosting tree count via grid search.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
gb_clf = GradientBoostingClassifier(n_estimators=80, random_state=400)
mod = GridSearchCV(gb_clf, param_grid={'n_estimators': [80, 180, 200, 220, 240, 260, 280, 300]})
mod.fit(X_train, Y_train)
mod.best_estimator_
# Fit XGBoost with the same seed as the other tuned models.
from xgboost import XGBClassifier
xgb_clf = XGBClassifier(n_estimators=80,random_state=400)
xgb_clf.fit(X_train, Y_train)
score = xgb_clf.score(X_test,Y_test)
print(score)
Y_pred = xgb_clf.predict(X_test)
from sklearn.metrics import confusion_matrix
# BUG FIX: don't rebind the name `confusion_matrix` — it shadows the imported
# function for the rest of the module.
cm = confusion_matrix(Y_test, Y_pred)
print(cm)
from sklearn.metrics import classification_report
print(classification_report(Y_test, Y_pred))
# Precision-recall curve and F1 score for XGBoost.
from sklearn.datasets import make_classification
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from matplotlib import pyplot
# Positive-class probability and hard predictions on the test set.
xgb_probs = xgb_clf.predict_proba(X_test)
xgb_probs = xgb_probs[:, 1]
yhat = xgb_clf.predict(X_test)
xgb_precision, xgb_recall, _ = precision_recall_curve(Y_test, xgb_probs)
xgb_f1 = f1_score(Y_test, yhat)
xgb_auc = auc(xgb_recall, xgb_precision)
print(f'Xgboost: f1={xgb_f1:.3f} auc={xgb_auc:.3f}')
# A "no skill" model sits at the positive-class rate on a PR plot.
no_skill = len(Y_test[Y_test==1]) / len(Y_test)
pyplot.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
pyplot.plot(xgb_recall, xgb_precision, marker='.', label='XGBoost')
pyplot.xlabel('Recall')
pyplot.ylabel('Precision')
pyplot.legend()
pyplot.show()
# ROC curve and area under it for XGBoost.
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
xgb_clf_roc_auc = roc_auc_score(Y_test, xgb_clf.predict(X_test))
fpr, tpr, thresholds = roc_curve(Y_test, xgb_clf.predict_proba(X_test)[:,1])
plt.figure(figsize=(14, 6))
plt.plot(fpr, tpr, label='XGBoost (area = %0.2f)' % xgb_clf_roc_auc)
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('xgb_ROC')
plt.show()
# Tune the XGBoost tree count via grid search.
mod1 = GridSearchCV(xgb_clf, param_grid={'n_estimators': [80, 100, 120, 140, 160, 180, 200, 220, 240, 260, 280, 300]})
mod1.fit(X_train, Y_train)
mod1.best_estimator_
# Non-zero feature importances from the fitted XGBoost model, largest first.
xgb_feat = pd.Series(xgb_clf.feature_importances_, index=X_train.columns).sort_values(ascending=False).reset_index()
xgb_feat1 = xgb_feat[xgb_feat[0] > 0]
xgb_feat1
import plotly.express as px
xgb_feat1 = xgb_feat1.rename(columns={0: 'feature importance'})
px.bar(xgb_feat1, x='index', y='feature importance', height=900)
# Shallow decision tree as an interpretable baseline.
import sklearn.tree as tree
DT=tree.DecisionTreeClassifier(max_depth=3,random_state=200)
DT.fit(X_train,Y_train)
DT.score(X_train,Y_train)
DT.score(X_test,Y_test)
Y_pred = DT.predict(X_test)
from sklearn.metrics import confusion_matrix
# BUG FIX: don't rebind the name `confusion_matrix` — it shadows the imported
# function for the rest of the module.
cm = confusion_matrix(Y_test, Y_pred)
print(cm)
from sklearn.metrics import classification_report
print(classification_report(Y_test, Y_pred))
# Precision-recall curve and F1 score for the decision tree.
from sklearn.datasets import make_classification
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from matplotlib import pyplot
# Positive-class probability and hard predictions on the test set.
DT_probs = DT.predict_proba(X_test)
DT_probs = DT_probs[:, 1]
yhat = DT.predict(X_test)
DT_precision, DT_recall, _ = precision_recall_curve(Y_test, DT_probs)
DT_f1 = f1_score(Y_test, yhat)
DT_auc = auc(DT_recall, DT_precision)
print(f'Decision Tree: f1={DT_f1:.3f} auc={DT_auc:.3f}')
# A "no skill" model sits at the positive-class rate on a PR plot.
no_skill = len(Y_test[Y_test==1]) / len(Y_test)
pyplot.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
pyplot.plot(DT_recall, DT_precision, marker='.', label='Decision Tree')
pyplot.xlabel('Recall')
pyplot.ylabel('Precision')
pyplot.legend()
pyplot.show()
# ROC curve and area under it for the decision tree.
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
DT_roc_auc = roc_auc_score(Y_test, DT.predict(X_test))
fpr, tpr, thresholds = roc_curve(Y_test, DT.predict_proba(X_test)[:,1])
plt.figure(figsize=(14, 6))
plt.plot(fpr, tpr, label='Decision Tree (area = %0.2f)' % DT_roc_auc)
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('DT_ROC')
plt.show()
# Tune the decision-tree depth via grid search.
from sklearn.model_selection import GridSearchCV
mod=GridSearchCV(DT,param_grid={'max_depth':[3,7,8,9]})
mod.fit(X_train,Y_train)
mod.best_estimator_
#best depth=3
# Top-100 features by importance.
# NOTE(review): the importances come from the random forest `clf`, not the
# tuned decision tree — confirm that is intended.
best_feat=pd.DataFrame({'Features':X_train.columns,'Importance':clf.feature_importances_})
# BUG FIX: ascending must be a bool — pandas' keyword validation rejects
# ascending=0 on current releases.
best_feat.sort_values('Importance',ascending=False).head(100).reset_index()